## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'plotly'
##
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
##
## The following object is masked from 'package:stats':
##
## filter
##
##
## The following object is masked from 'package:graphics':
##
## layout
Imported demographic, HIV test, and sexual behavior dataframes 1999 - 2016.
demo_list =
list.files(path = "NHANES/DEMO", full.names = TRUE)
hiv_list =
list.files(path = "NHANES/HIV", full.names = TRUE)
sxq_list =
list.files(path = "NHANES/SXQ", full.names = TRUE)
load_files = function(x){
filetype =
str_extract(x, "DEMO|HIV|SXQ")
year =
str_extract(x, "(?<=_)[A-Z]")
data =
read_xpt(x) |>
janitor::clean_names()|>
mutate(filetype = filetype, year = year)
}
demo_list_output =
map(demo_list, load_files)
hiv_list_output =
map(hiv_list, load_files)
sxq_list_output =
map(sxq_list, load_files)
join_pair = function(df1, df2) {
left_join(df1, df2, by = c("seqn", "year"))
}
hiv_demo_list =
map2(hiv_list_output, demo_list_output, ~join_pair(.x, .y))
hiv_demo_sxq_df =
map2(hiv_demo_list, sxq_list_output, ~join_pair(.x, .y)) |>
bind_rows()
Re-coded variables of interest: Restricted particpants’s age between 20 and 49
cleaned_joined_df =
hiv_demo_sxq_df |>
filter(ridageyr >= 20 & ridageyr <= 49) |>
mutate(hiv = case_when(
lbdhi == 1 | lbxhivc == 1 ~ 1,
lbdhi == 2 | lbxhivc == 2 ~ 2)
)|>
mutate(MSM = case_when(
year %in% c("A","B","C") ~ sxq220,
year %in% c("D","E","F","G","H","I") ~ sxq550)
)|>
mutate(WSW = case_when(
year %in% c("A","B","C") ~ sxq150,
year %in% c("D","E","F","G","H","I") ~ sxq490)
)|>
mutate(year = recode(year,
"A" = "1999-2000",
"B" = "2001-2002",
"C" = "2003-2004",
"D" = "2005-2006",
"E" = "2007-2008",
"F" = "2009-2010",
"G" = "2011-2012",
"H" = "2013-2014",
"I" = "2015-2016"),
hiv = recode(hiv,
"1" = "Reactive",
"2" = "Non-reactive"),
gender = recode(riagendr,
"1" = "Male",
"2" = "Female"),
age = ridageyr,
race = recode(ridreth1,
"1" = "Mexican American",
"2" = "Other Hispanic",
"3" = "Non-Hispanic White",
"4" = "Non-Hispanic Black",
"5" = "Other Race - Including Multi-Racial"),
education = recode(dmdeduc2,
"1" = "Less than 9th grade",
"2" = "9-11th grade",
"3" = "High school graduate or equivalent",
"4" = "Some college or AA degree",
"5" = "College graduate or above",
"7" = NA_character_,
"9" = NA_character_),
marriage = recode(dmdmartl,
"1" = "Married",
"2" = "Widowed",
"3" = "Divorced",
"4" = "Separated",
"5" = "Never married",
"6" = "Living with partner",
"77" = NA_character_,
"99" = NA_character_)
) |>
mutate(samesexcontact = case_when(
(MSM %in% c(0, 7777, 77777,9999, 99999) | WSW %in% c(0, 7777, 77777, 9999, 99999)) ~ 0,
(MSM >= 1 & MSM <= 600) | (WSW >= 1 & WSW <= 600) ~ 1
)) |>
select(seqn, hiv, age, gender, race, education, marriage, year, samesexcontact)
cleaned_joined_df |>
group_by(samesexcontact, hiv) |>
summarize(n = n())
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## # A tibble: 9 × 3
## # Groups: samesexcontact [3]
## samesexcontact hiv n
## <dbl> <chr> <int>
## 1 0 Non-reactive 6826
## 2 0 Reactive 33
## 3 0 <NA> 137
## 4 1 Non-reactive 572
## 5 1 Reactive 48
## 6 1 <NA> 15
## 7 NA Non-reactive 15589
## 8 NA Reactive 43
## 9 NA <NA> 1136
Visualization - 1 Total number as y-axis value Across race
cleaned_joined_df %>%
filter(hiv == "Reactive") %>%
group_by(race, gender) %>%
summarize(n = n(), age_mean = round(mean(age), 2)) %>%
ungroup() %>%
mutate(race = fct_reorder(race, n)) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~race, y = ~n, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Race"),
yaxis = list(title = "Total Number of HIV Positive Patients"),
title = "The Total Number of HIV Positive Patients from 1999 to 2016 Across Races"
)
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
Visualization - 1-1 Total percent as y-axis value
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
) %>%
group_by(race, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(race = fct_reorder(race, hiv_percent)) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~race, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Race"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Races"
)
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
Visualization 2: by year
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>%
group_by(year, gender) %>%
summarize(age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~year, y = ~hiv_percent, color = ~gender, type = "scatter", mode = "lines+markers",
text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Year"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016"
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
Visualization 3: Across Education level
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
) %>%
group_by(education, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(education = forcats::fct_relevel(education, c("Less than 9th grade", "9-11th grade",
"High school graduate or equivalent",
"Some college or AA degree",
"College graduate or above"))) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~education, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Education level"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Education Level"
)
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations
Visualization 4: Across marital status
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>%
group_by(marriage, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(marriage = fct_reorder(marriage, hiv_percent)) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~marriage, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Marital Status"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Marital Status"
)
## `summarise()` has grouped output by 'marriage'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations
Visualization 5: Sexual behavior
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv),
samesexcontact = recode(samesexcontact, "0" = "No",
"1" = "Yes")) %>%
group_by(samesexcontact, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2), hiv_num = sum(hiv == "Reactive"),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~samesexcontact, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Same Sex Sexual Behavior"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Sexual Behavior"
)
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## Warning: Ignoring 2 observations
set the reference group
cleaned_regression_df=
cleaned_joined_df |>
mutate(
hiv_outcome = ifelse(hiv == "Reactive", 1, ifelse(hiv == "Non-reactive", 0, NA)),
education = forcats::fct_relevel(education, c("Less than 9th grade", "9-11th grade", "High school graduate or equivalent", "Some college or AA degree", "College graduate or above")),
race = forcats::fct_relevel(race, c("Non-Hispanic White", "Mexican American", "Non-Hispanic Black", "Other Hispanic", "Other Race - Including Multi-Racial")),
marriage = forcats::fct_relevel(marriage, c("Married", "Widowed", "Divorced", "Separated", "Never married", "Living with partner"))
)
This is the full model
fit_alt = cleaned_regression_df |>
glm(hiv_outcome ~ samesexcontact + gender + age + race + education + marriage, data = _, family = binomial())
test samesexcontact, gender,
age
fit_alt|>
broom::tidy() |>
mutate(OR = exp(estimate)) |>
select(term, estimate, OR, p.value)
## # A tibble: 17 × 4
## term estimate OR p.value
## <chr> <dbl> <dbl> <dbl>
## 1 (Intercept) -10.1 0.0000403 3.88e-24
## 2 samesexcontact 2.95 19.1 8.25e-26
## 3 genderMale 1.90 6.67 6.25e-10
## 4 age 0.0583 1.06 1.55e- 4
## 5 raceMexican American 0.295 1.34 5.38e- 1
## 6 raceNon-Hispanic Black 1.97 7.17 3.25e- 9
## 7 raceOther Hispanic 1.08 2.96 2.45e- 2
## 8 raceOther Race - Including Multi-Racial -14.3 0.000000618 9.78e- 1
## 9 education9-11th grade -0.384 0.681 5.76e- 1
## 10 educationHigh school graduate or equivalent -0.360 0.698 5.97e- 1
## 11 educationSome college or AA degree -0.304 0.738 6.48e- 1
## 12 educationCollege graduate or above -1.11 0.328 1.24e- 1
## 13 marriageWidowed 1.99 7.34 7.29e- 2
## 14 marriageDivorced 1.12 3.08 4.02e- 2
## 15 marriageSeparated 1.39 4.01 2.23e- 2
## 16 marriageNever married 1.37 3.95 1.40e- 3
## 17 marriageLiving with partner 1.08 2.93 3.36e- 2
samesexcontact, gender, age
are all significanttest whether race is significant
fit_no_race =
glm(hiv_outcome ~ samesexcontact + gender + age + education + marriage, data = na.omit(cleaned_regression_df), family = binomial())
anova(fit_no_race, fit_alt, test = 'LRT') |> broom::tidy()
## # A tibble: 2 × 6
## term df.residual residual.deviance df deviance p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hiv_outcome ~ samesexc… 7243 639. NA NA NA
## 2 hiv_outcome ~ samesexc… 7239 581. 4 58.4 6.26e-12
race is significanttest whether education is significant
fit_no_education =
glm(hiv_outcome ~ samesexcontact + gender + age + race + marriage, data = na.omit(cleaned_regression_df), family = binomial())
anova(fit_no_education, fit_alt, test = 'LRT') |> broom::tidy()
## # A tibble: 2 × 6
## term df.residual residual.deviance df deviance p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hiv_outcome ~ samesexcon… 7243 587. NA NA NA
## 2 hiv_outcome ~ samesexcon… 7239 581. 4 5.65 0.227
education is not significanttest whether marriage is significant
fit_no_marriage =
glm(hiv_outcome ~ samesexcontact + gender + age + race + education, data = na.omit(cleaned_regression_df), family = binomial())
anova(fit_no_marriage, fit_alt, test = 'LRT') |> broom::tidy()
## # A tibble: 2 × 6
## term df.residual residual.deviance df deviance p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hiv_outcome ~ samesexcon… 7244 595. NA NA NA
## 2 hiv_outcome ~ samesexcon… 7239 581. 5 13.7 0.0174
marriage is significantsamesexcontact,
gender, age, race,
marriageThe final model
fit_regression = cleaned_regression_df |>
glm(hiv_outcome ~ samesexcontact + gender + age + race + marriage, data = _, family = binomial())
fit_regression|>
broom::tidy() |>
mutate(OR = exp(estimate)) |>
select(term, estimate, OR, p.value)|>
knitr::kable(digits = 3)
| term | estimate | OR | p.value |
|---|---|---|---|
| (Intercept) | -10.460 | 0.000 | 0.000 |
| samesexcontact | 2.834 | 17.020 | 0.000 |
| genderMale | 1.820 | 6.171 | 0.000 |
| age | 0.056 | 1.058 | 0.000 |
| raceMexican American | 0.455 | 1.576 | 0.324 |
| raceNon-Hispanic Black | 2.072 | 7.941 | 0.000 |
| raceOther Hispanic | 1.245 | 3.473 | 0.009 |
| raceOther Race - Including Multi-Racial | -14.286 | 0.000 | 0.978 |
| marriageWidowed | 2.036 | 7.661 | 0.066 |
| marriageDivorced | 1.138 | 3.121 | 0.036 |
| marriageSeparated | 1.406 | 4.080 | 0.020 |
| marriageNever married | 1.341 | 3.823 | 0.002 |
| marriageLiving with partner | 1.033 | 2.811 | 0.040 |
bootstrap_df =
cleaned_regression_df |>
bootstrap(n = 500) |>
mutate(
models = map(.x = strap, ~glm(hiv_outcome ~ samesexcontact + gender + age + race + marriage, data = .x, family = binomial()) ),
results = map(models, broom::tidy)) |>
select(-strap, -models) |>
unnest(results) |>
group_by(term) |>
mutate(OR = exp(estimate))
## Warning: There were 177 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `models = map(...)`.
## Caused by warning:
## ! glm.fit: fitted probabilities numerically 0 or 1 occurred
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 176 remaining warnings.
Look at the distribution of estimates under repeated sampling
bootstrap_df|>
filter(term == "age") |>
ggplot(aes(x = estimate))+
geom_density() +
facet_wrap(.~ term)
bootstrap_df|>
filter(term != "age") |>
ggplot(aes(x = estimate))+
geom_density() +
facet_wrap(.~ term)
Compute bootstrap mean OR and 95% CI
bootstrap_df|>
summarize(
mean_OR = mean(OR),
CI_lower = quantile(OR, 0.025),
CI_upper = quantile(OR, 0.975)
)|>
knitr::kable(digits = 3)
| term | mean_OR | CI_lower | CI_upper |
|---|---|---|---|
| (Intercept) | 0.000 | 0.000 | 0.000 |
| age | 1.060 | 1.031 | 1.091 |
| genderMale | 7.006 | 3.357 | 13.656 |
| marriageDivorced | 3.845 | 0.908 | 10.485 |
| marriageLiving with partner | 3.471 | 1.018 | 10.308 |
| marriageNever married | 4.657 | 1.654 | 13.025 |
| marriageSeparated | 5.139 | 0.751 | 14.666 |
| marriageWidowed | 11.892 | 0.000 | 55.766 |
| raceMexican American | 1.745 | 0.513 | 3.860 |
| raceNon-Hispanic Black | 8.788 | 4.505 | 16.707 |
| raceOther Hispanic | 3.894 | 1.246 | 9.348 |
| raceOther Race - Including Multi-Racial | 0.000 | 0.000 | 0.000 |
| samesexcontact | 18.716 | 10.444 | 32.479 |